library(dplyr)
library(psych)
library(fastDummies)



recode_demographic_variables_survey_one <- function(survey_data){
  # Swap the answer codes of the first survey's categorical demographic
  # columns for descriptive English labels.
  #
  # Args:
  #   survey_data: data frame containing the columns resp_occupation, nenshu,
  #     gakureki_resp, kekkon and city_size, each holding answer codes.
  #
  # Returns:
  #   survey_data with those five columns recoded in place; every other
  #   column is left untouched.

  # code -> label lookups, one per recoded column
  occupation_labels <- c(
    "1" = "Self-Employed",
    "2" = "Business Owner",
    "3" = "Full-Time Employee",
    "4" = "Part-time Employee",
    "5" = "Temporary Worker",
    "6" = "Contract Worker",
    "7" = "Student",
    "8" = "Homemaker",
    "9" = "Unemployed",
    "10" = "Other",
    "99" = "Don't Want to Answer"
  )

  income_labels <- c(
    "1" = "2M Yen or Less",
    "2" = "2M-4M Yen",
    "3" = "4M-6M Yen",
    "4" = "6M-8M Yen",
    "5" = "8M-10M Yen",
    "6" = "10M-12M Yen",
    "7" = "12M-15M Yen",
    "8" = "15M+ Yen",
    "99" = "Don't Want to Answer"
  )

  education_labels <- c(
    "1" = "Elementary or Middle School",
    "2" = "High School",
    "3" = "Vocational School",
    "4" = "2 Year University",
    "5" = "4 Year University",
    "6" = "Graduate School",
    "99" = "Don't Want to Answer"
  )

  marital_labels <- c(
    "1" = "Unmarried",
    "2" = "Married",
    "3" = "Separated",
    "4" = "Widowed",
    "99" = "Don't Want to Answer"
  )

  city_size_labels <- c(
    "3" = "Tokyo Wards, Designated Cities",
    "2" = "Other City",
    "1" = "Village",
    "99" = "Don't Want to Answer"
  )

  # `!!!` splices each lookup into recode() as individual `code = label` pairs
  survey_data$resp_occupation <- recode(survey_data$resp_occupation,
                                        !!!occupation_labels)
  survey_data$nenshu <- recode(survey_data$nenshu, !!!income_labels)
  survey_data$gakureki_resp <- recode(survey_data$gakureki_resp,
                                      !!!education_labels)
  survey_data$kekkon <- recode(survey_data$kekkon, !!!marital_labels)
  survey_data$city_size <- recode(survey_data$city_size, !!!city_size_labels)

  survey_data
}

recode_demographic_variables_survey_two <- function(survey_data){
  # Add descriptively named demographic columns to the follow-up survey data.
  #
  # Args:
  #   survey_data: data frame holding the raw answer columns Q22, Q24, Q28,
  #     Q223, Q32, Q34 and Q211. Q24 must be numeric because Age is computed
  #     as 2021 - Q24.
  #
  # Returns:
  #   survey_data with the columns "Gender: ", "Age", "Marrital Status: ",
  #   "Education Level: ", "Occupation: ", "Annual Income: " and
  #   "City Size: " added. The raw Q* columns are kept.
  #
  # NOTE: the spelling "Marrital Status: " is kept on purpose; the rest of
  # this script selects the column by that exact name.
  survey_data %>%
    mutate("Gender: " = recode(Q22,
                               "0" = "Male",
                               "1" = "Female",
                               "99" = "Don't Want to Answer",
                               .default = "Don't Want to Answer"),
           "Age" = 2021 - Q24,
           "Marrital Status: " = recode(Q28,
                                        "1" = "Unmarried",
                                        "2" = "Married",
                                        "3" = "Separated",
                                        "4" = "Widowed",
                                        "99" = "Don't Want to Answer"),
           "Education Level: " = recode(Q223,
                                        "1" = "High School or Less",
                                        "2" = "Vocational School",
                                        "3" = "2 Year University",
                                        "4" = "4 Year University",
                                        "5" = "Graduate School",
                                        "99" = "Don't Want to Answer"),
           "Occupation: " = recode(Q32,
                                   "1" = "Self-Employed",
                                   "2" = "Business Owner",
                                   "3" = "Full-Time Employee",
                                   "4" = "Part-time Employee",
                                   "5" = "Temporary Worker",
                                   "6" = "Contract Worker",
                                   "7" = "Student",
                                   "8" = "Homemaker",
                                   "9" = "Unemployed",
                                   "10" = "Other",
                                   "99" = "Don't Want to Answer"),
           "Annual Income: " = recode(Q34,
                                      "1" = "2M Yen or Less",
                                      "2" = "2M-4M Yen",
                                      "3" = "4M-6M Yen",
                                      "4" = "6M-8M Yen",
                                      "5" = "8M-10M Yen",
                                      "6" = "10M-12M Yen",
                                      "7" = "12M-15M Yen",
                                      "8" = "15M+ Yen",
                                      "99" = "Don't Want to Answer"),
           "City Size: " = recode(Q211,
                                  "3" = "Tokyo Wards, Designated Cities",
                                  "2" = "Other City",
                                  "1" = "Village",
                                  # matches the literal string "NA" only
                                  `NA` = "Don't Want to Answer",
                                  "99" = "Don't Want to Answer",
                                  # Real missing values are not matched by the
                                  # `NA` name above and need `.missing`.
                                  # recode() rejects `.missing` for factor
                                  # input, so it is only supplied otherwise.
                                  .missing = if (is.factor(Q211)) {
                                    NULL
                                  } else {
                                    "Don't Want to Answer"
                                  }))
}


# ---- First survey: descriptive statistics table ----

# Locations of the formatted survey data, relative to the working directory.
first_survey_file <- "data/first_survey_formatted.rds"
second_survey_file <- "data/follow_up_survey_formatted.rds"

# keeps one row per respondent (conjoint data format has 1 row per task)
first_survey_data <- readRDS(first_survey_file) %>%
  distinct(ResponseId, .keep_all = TRUE)

# Demographic columns that feed the descriptive table.
first_survey_relevant_variables <- c(
  "male_resp",
  "city_size",
  "birth_year",
  "kekkon",
  "resp_occupation",
  "nenshu",
  "gakureki_resp"
)

first_survey_data$birth_year <- as.numeric(first_survey_data$birth_year)
# NOTE(review): the "- 1" presumably turns a 1/2 coding into a 0/1 indicator;
# confirm against the stored type of male_resp.
first_survey_data$male_resp <- as.numeric(first_survey_data$male_resp) - 1

# Keep the demographic columns, derive age, label the answer codes and give
# the columns the names that appear in the table.
first_survey_data <- first_survey_data %>%
  select(all_of(first_survey_relevant_variables)) %>%
  mutate(age = 2021 - birth_year) %>%
  recode_demographic_variables_survey_one() %>%
  rename("City Size: " = "city_size",
         "Marrital Status: " = "kekkon",
         "Annual Income: " = "nenshu",
         "Occupation: " = "resp_occupation",
         "Education Level: " = "gakureki_resp",
         "Male Respondent" = "male_resp",
         "Age" = "age")

# Turn each categorical variable into one dummy (0/1) column per category.
descriptive_table_data <- fastDummies::dummy_cols(
  first_survey_data,
  select_columns = c("City Size: ",
                     "Marrital Status: ",
                     "Annual Income: ",
                     "Occupation: ",
                     "Education Level: "),
  remove_selected_columns = TRUE
)

# Drop the first "_" of every column name so the dummy columns read
# "<variable><category>". Side effect: "birth_year" becomes "birthyear".
colnames(descriptive_table_data) <- stringr::str_remove(
  colnames(descriptive_table_data), "_"
)

# Summary statistics for every remaining column. Birth year is excluded
# (under the name the underscore removal above gave it) because Age already
# carries that information.
descriptive_table <- descriptive_table_data %>%
  select(-birthyear) %>%
  psych::describe() %>%
  select(mean, sd, median, min, max)

# Print explicitly: a bare top-level expression is not shown when this file
# is run through source(), which would silently drop the table.
print(kableExtra::kable(descriptive_table, format = "latex", digits = 2))


# ---- Follow-up survey: descriptive statistics table ----

second_survey_data <- readRDS(second_survey_file)
# Treat empty-string answers as missing.
second_survey_data[second_survey_data == ""] <- NA
# Q24 must be numeric: the recoding step computes Age as 2021 - Q24.
second_survey_data$Q24 <- as.numeric(second_survey_data$Q24)

second_survey_data <- recode_demographic_variables_survey_two(second_survey_data)

# Categorical columns, in the order their dummy columns appear in the table.
second_survey_categorical_columns <- c(
  "City Size: ",
  "Gender: ",
  "Marrital Status: ",
  "Annual Income: ",
  "Occupation: ",
  "Education Level: "
)

second_survey_data <- second_survey_data %>%
  select(all_of(
    c("Age",
      "Gender: ",
      "City Size: ",
      "Marrital Status: ",
      "Annual Income: ",
      "Occupation: ",
      "Education Level: ")
  ))

# Label unanswered categorical questions. Only the categorical columns are
# touched so the numeric Age column keeps its type and its NAs, instead of
# being coerced to character and converted back afterwards.
second_survey_data[second_survey_categorical_columns] <- lapply(
  second_survey_data[second_survey_categorical_columns],
  function(answers) {
    answers[is.na(answers)] <- "Don't Want to Answer"
    answers
  }
)

# Turn each categorical variable into one dummy (0/1) column per category.
second_survey_descriptive_table_data <- fastDummies::dummy_cols(
  second_survey_data,
  select_columns = second_survey_categorical_columns,
  remove_selected_columns = TRUE
)

# Drop the first "_" of every column name so the dummy columns read
# "<variable><category>".
colnames(second_survey_descriptive_table_data) <- stringr::str_remove(
  colnames(second_survey_descriptive_table_data), "_"
)

# calculate statistics for each variable
descriptive_table_second_survey <- second_survey_descriptive_table_data %>%
  psych::describe() %>%
  select(mean, sd, median, min, max)

print(kableExtra::kable(descriptive_table_second_survey, format = "latex", digits = 2))

